Load Data

First, let’s take a look at the data that is available to us.

Loading data from .csv files

How do you read a .csv into R?

# read in biographical data table
bio <- read_csv("https://raw.githubusercontent.com/majerus/apra_data_science_courses/master/bio_data_table.csv")

# read in giving data table
giving <- read_csv("https://raw.githubusercontent.com/majerus/apra_data_science_courses/master/giving_data_table.csv")

You can read multiple data files into the same R session. Each of these files contain fictional data created by the generate_data.R script.

Loading data from a database

Have you ever connected R to a database?

The following is an example of how to create a sample database in R and to load information from that database. More information on using databases from R can be found here. If you use the tidyverse, you can use the same workflow with information from databases and .csv or Excel files.

# create database connection
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":dbname:")

# put some data in our new database
copy_to(dest = con,
        df = bio,
        name = "bio_table",
        temporary = FALSE)

copy_to(dest = con,
        df = giving ,
        name = "giving_table",
        temporary = FALSE)

# print out our table names
db_list_tables(con)
## [1] "bio_table"    "giving_table" "sqlite_stat1" "sqlite_stat4"
# let's take a look at the bio table
tbl(con, "bio_table") 
## # Source:   table<bio_table> [?? x 14]
## # Database: sqlite 3.30.1 []
##        id name  household_id country city  birthday deceased zip   state   lat
##     <dbl> <chr>        <dbl> <chr>   <chr>    <dbl> <chr>    <chr> <chr> <dbl>
##  1 5.96e6 Smit…      1000259 United… Lumb…   -16640 Y        28358 NC     34.6
##  2 5.44e6 el-N…      1000279 United… West…   -16613 Y        19382 PA     40.0
##  3 3.95e6 al-D…      1000279 United… Ronk…       NA Y        11779 NY     40.8
##  4 4.13e6 Blac…      1000308 United… Colu…       NA Y        43207 OH     40.0
##  5 7.81e6 Aten…      1000308 United… De p…   -18532 Y        54115 WI     44.4
##  6 3.36e6 Ahma…      1000570 United… Gree…   -16585 Y        54302 WI     44.5
##  7 1.98e6 Han,…      1000758 United… Phoe…   -17676 <NA>     85035 AZ     33.5
##  8 7.67e6 Saxe…      1000758 United… Lake…   -17442 <NA>     55041 MN     44.4
##  9 7.87e6 Pedr…      1000913 United… Harp…   -16340 Y        48225 MI     42.4
## 10 2.03e6 Raib…      1000913 United… Manc…   -18381 Y        03103 NH     43.0
## # … with more rows, and 4 more variables: lon <dbl>, capacity <chr>,
## #   capacity_source <chr>, race <chr>
# we can use dplyr syntax to query a database
# dplyr automatically converts our r code to sql
# alternatively you can write sql code directly in rmarkdown as well
tbl(con, "bio_table") %>% 
  filter(state == "NC") %>% 
  select(name, city, capacity)
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.30.1 []
##    name                city         capacity   
##    <chr>               <chr>        <chr>      
##  1 Smith, Katalina     Lumberton    $50k - $75K
##  2 Silvis, Eric        Greenville   $5k - $10k 
##  3 Pinto, Anposahiyela Roxboro      $10k - $25k
##  4 Sullivan, Janetta   Troutman     $2.5k - $5k
##  5 el-Mina, Husaam     Huntersville $5k - $10k 
##  6 Littlejohn, Colton  Kinston      $10k - $25k
##  7 Marrs, Demetrius    Gastonia     <NA>       
##  8 Lovato, Lucero Rubi Statesville  $2.5k - $5k
##  9 Black, Kenneth      Salisbury    $50k - $75K
## 10 Dinakar, Hyeon      Harmony      $10k - $25k
## # … with more rows

Data Cleaning

Missing Values

How missing values work in R

R uses the NA code for missing values. You can test if a value is missing using the is.na() function.

How many missing values are there in the deceased variable?

is.na(bio$deceased)[1:100] 
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97]  TRUE FALSE FALSE  TRUE
sum(is.na(bio$deceased))
## [1] 10000
bio %>% 
  summarise(deceased_na = sum(is.na(deceased)))
## # A tibble: 1 x 1
##   deceased_na
##         <int>
## 1       10000
bio %>% 
  summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 14
##      id  name household_id country  city birthday deceased   zip state   lat
##   <int> <int>        <int>   <int> <int>    <int>    <int> <int> <int> <int>
## 1     0     0            0       0     0    10000    10000 10000 10000 10000
## # … with 4 more variables: lon <int>, capacity <int>, capacity_source <int>,
## #   race <int>
giving %>% 
  summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 6
##   household_id    id gift_id credit_type gift_amt gift_date
##          <int> <int>   <int>       <int>    <int>     <int>
## 1            0     0       0           0        0         0

Which records are missing zip, state, lat, and lon?

bio %>% 
  filter(is.na(zip)) %>% 
  glimpse()
## Rows: 10,000
## Columns: 14
## $ id              <dbl> 2307732, 5595799, 8993651, 4469674, 7406292, 6959349,…
## $ name            <chr> "al-Hassan, Rihaab", "Pavisook, Bethany", "Hands, Isa…
## $ household_id    <dbl> 1001089, 1003034, 1003287, 1003320, 1004096, 1004098,…
## $ country         <chr> "Mexico", "China", "India", "Pakistan", "Pakistan", "…
## $ city            <chr> "Mexico City", "Beijing", "Kolkata", "Faisalabad", "L…
## $ birthday        <date> 1923-12-24, 1920-09-20, 1921-10-26, NA, 1919-09-09, …
## $ deceased        <chr> "Y", "N", "Y", "Y", "Y", "Y", "N", "Y", "Y", "Y", "Y"…
## $ zip             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ state           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lat             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lon             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ capacity        <chr> "$2.5k - $5k", "$500k - $750k", "$500k - $750k", "$5k…
## $ capacity_source <chr> "institutional", "screening", "screening", "screening…
## $ race            <chr> "Non-Hispanic white", "Non-Hispanic white", "Hispanic…

The zipcode package can be used to get lat/lon coordinates for each zipcode’s centroid in the US. This data is also available here.

Data Cleaning

You can treat character, numeric, and factor variables seperately using variations of the select function.

bio %>% 
  select_if(is.numeric)
## # A tibble: 100,000 x 4
##         id household_id   lat    lon
##      <dbl>        <dbl> <dbl>  <dbl>
##  1 5961718      1000259  34.6  -79.0
##  2 5443595      1000279  40.0  -75.6
##  3 3946937      1000279  40.8  -73.1
##  4 4129813      1000308  40.0  -83.0
##  5 7813954      1000308  44.4  -88.1
##  6 3362879      1000570  44.5  -88.0
##  7 1980892      1000758  33.5 -112. 
##  8 7674576      1000758  44.4  -92.3
##  9 7870237      1000913  42.4  -82.9
## 10 2026476      1000913  43.0  -71.4
## # … with 99,990 more rows
bio %>% 
  select_if(is.character)
## # A tibble: 100,000 x 9
##    name    country  city   deceased zip   state capacity capacity_source race   
##    <chr>   <chr>    <chr>  <chr>    <chr> <chr> <chr>    <chr>           <chr>  
##  1 Smith,… United … Lumbe… Y        28358 NC    $50k - … screening       Black …
##  2 el-Nia… United … West … Y        19382 PA    $2.5k -… screening       Non-Hi…
##  3 al-Dib… United … Ronko… Y        11779 NY    $2.5k -… institutional   Two or…
##  4 Black,… United … Colum… Y        43207 OH    $250k -… institutional   Non-Hi…
##  5 Atenci… United … De pe… Y        54115 WI    $75k - … screening       Non-Hi…
##  6 Ahmad,… United … Green… Y        54302 WI    $75k - … screening       Non-Hi…
##  7 Han, L… United … Phoen… <NA>     85035 AZ    $2.5k -… institutional   Non-Hi…
##  8 Saxer,… United … Lake … <NA>     55041 MN    $25k - … screening       Non-Hi…
##  9 Pedraz… United … Harpe… Y        48225 MI    $25k - … screening       Hispan…
## 10 Raibur… United … Manch… Y        03103 NH    $25k - … institutional   Non-Hi…
## # … with 99,990 more rows

Which variable is but should not be a character?

bio %>% 
  select_if(is.character)  
## # A tibble: 100,000 x 9
##    name    country  city   deceased zip   state capacity capacity_source race   
##    <chr>   <chr>    <chr>  <chr>    <chr> <chr> <chr>    <chr>           <chr>  
##  1 Smith,… United … Lumbe… Y        28358 NC    $50k - … screening       Black …
##  2 el-Nia… United … West … Y        19382 PA    $2.5k -… screening       Non-Hi…
##  3 al-Dib… United … Ronko… Y        11779 NY    $2.5k -… institutional   Two or…
##  4 Black,… United … Colum… Y        43207 OH    $250k -… institutional   Non-Hi…
##  5 Atenci… United … De pe… Y        54115 WI    $75k - … screening       Non-Hi…
##  6 Ahmad,… United … Green… Y        54302 WI    $75k - … screening       Non-Hi…
##  7 Han, L… United … Phoen… <NA>     85035 AZ    $2.5k -… institutional   Non-Hi…
##  8 Saxer,… United … Lake … <NA>     55041 MN    $25k - … screening       Non-Hi…
##  9 Pedraz… United … Harpe… Y        48225 MI    $25k - … screening       Hispan…
## 10 Raibur… United … Manch… Y        03103 NH    $25k - … institutional   Non-Hi…
## # … with 99,990 more rows
bio <-
  bio %>% 
  mutate(zip = as.numeric(zip))

How might we recode the missing values for the deceased variable?

bio <-
  bio %>% 
  mutate(deceased_missing = ifelse(is.na(deceased), "Y", "N"),
         deceased = ifelse(is.na(deceased), "N", deceased)) 

Are certain capacity sources missing capacity information?

# capacity source 
bio %>% 
  count(capacity, capacity_source)
## # A tibble: 49 x 3
##    capacity      capacity_source     n
##    <chr>         <chr>           <int>
##  1 >$1k          institutional    1940
##  2 >$1k          screening        2983
##  3 >$1k          <NA>              520
##  4 $100k - $250k institutional    1645
##  5 $100k - $250k screening        2486
##  6 $100k - $250k <NA>              469
##  7 $10k - $25k   institutional    4840
##  8 $10k - $25k   screening        7416
##  9 $10k - $25k   <NA>             1377
## 10 $10M - $25M   institutional       1
## # … with 39 more rows
bio %>%
  filter(is.na(capacity_source)) %>% 
  count(capacity, capacity_source)
## # A tibble: 15 x 3
##    capacity      capacity_source     n
##    <chr>         <chr>           <int>
##  1 >$1k          <NA>              520
##  2 $100k - $250k <NA>              469
##  3 $10k - $25k   <NA>             1377
##  4 $10M - $25M   <NA>                2
##  5 $1k - $2.5k   <NA>              525
##  6 $1M - $2.5M   <NA>                7
##  7 $2.5k - $5k   <NA>              938
##  8 $250k - $500k <NA>              492
##  9 $25k - $50k   <NA>             1385
## 10 $500k - $750k <NA>              461
## 11 $50k - $75K   <NA>              883
## 12 $5k - $10k    <NA>              855
## 13 $750k - $1M   <NA>              240
## 14 $75k - $100k  <NA>              891
## 15 <NA>          <NA>              955

Let’s take a closer look at the birthday variable. What do you notice when we sort all birthdays in order?

# birthdays - let's sort all the birthdays in order 
bio %>% 
  select(birthday, deceased) %>% 
  arrange(birthday)
## # A tibble: 100,000 x 2
##    birthday   deceased
##    <date>     <chr>   
##  1 1900-01-01 Y       
##  2 1900-01-01 Y       
##  3 1900-01-01 Y       
##  4 1900-01-01 Y       
##  5 1900-01-01 Y       
##  6 1900-01-01 Y       
##  7 1900-01-01 Y       
##  8 1900-01-01 Y       
##  9 1900-01-01 Y       
## 10 1900-01-01 Y       
## # … with 99,990 more rows
# let's take a look at the distribution of birthdays
bio %>% 
  select(birthday) %>%  
  ggplot(aes(x = birthday)) +
  geom_histogram() 

# let's clean up what appears to be a missing value indicator 
bio <-
  bio %>%
  mutate(birthday = if_else(birthday == as.Date("1/1/1900", "%m/%d/%Y"),
                            as.Date(NA),
                            birthday))

# let's take another look
bio %>% 
  select(birthday) %>%  
  ggplot(aes(x = birthday)) +
  geom_histogram() 

Data Exploration

Small Multiples

# bio table - character variables bar plots
bio %>% 
  select_if(is.character) %>% 
  select(-name, -city) %>% 
  gather("variable", "value") %>% 
  ggplot(aes(x = value)) +
  geom_bar() +
  facet_wrap(~variable, scales = "free", nrow = 7) +
  theme(axis.text.y = element_text(size = 6)) +
  coord_flip()

What looks strange?

More Cleaning

# clean capacity ratings
sort(unique(bio$capacity))
##  [1] ">$1k"          "$100k - $250k" "$10k - $25k"   "$10M - $25M"  
##  [5] "$1k - $2.5k"   "$1M - $2.5M"   "$2.5k - $5k"   "$250k - $500k"
##  [9] "$25k - $50k"   "$500k - $750k" "$50k - $75K"   "$5k - $10k"   
## [13] "$5M - $10M"    "$750k - $1M"   "$75k - $100k"  "2.5M - $5M"
# demo multiple cursors

#  [1] ">$1k"
#  [2] "$100k - $250k"
#  [3] "$10k - $25k"
#  [4] "$10M - $25M"
#  [5] "$1k - $2.5k"
#  [6] "$1M - $2.5M"
#  [7] "$2.5k - $5k"
#  [8] "$250k - $500k"
#  [9] "$25k - $50k"
# [10] "$25M - $50M"
# [11] "$500k - $750k"
# [12] "$50k - $75K"
# [13] "$5k - $10k"
# [14] "$5M - $10M"
# [15] "$750k - $1M"
# [16] "$75k - $100k" 

bio <-
  bio %>% 
  mutate(capacity = factor(capacity, levels = c(">$1k",
                                                "$1k - $2.5k",
                                                "$2.5k - $5k",
                                                "$5k - $10k",
                                                "$10k - $25k",
                                                "$25k - $50k",
                                                "$50k - $75K",
                                                "$75k - $100k",
                                                "$100k - $250k",
                                                "$250k - $500k",
                                                "$500k - $750k",
                                                "$750k - $1M",
                                                "$1M - $2.5M",
                                                "2.5M - $5M",
                                                "$5M - $10M",
                                                "$10M - $25M",
                                                "$25M - $50M")))

# let's take another look at those capacities
bio %>% 
  select(capacity) %>% 
  ggplot(aes(x = capacity)) +
  geom_bar() +
  coord_flip()

Closer Look at State

# state
bio %>% 
  filter(!is.na(state)) %>% 
  count(state) %>% 
  arrange(desc(n)) %>% 
  slice(1:10) %>% 
  ggplot(aes(x = reorder(state, n),  y = n)) +
  geom_bar(stat = "identity", fill = "#027854") +
  coord_flip() +
  ggthemes::theme_tufte() +
  labs(y = "Number of Prospects", 
       x = "Primary Residence State",
       title = "Prospects by State")

Is this right? Do we need to exclude some prospects?

# state
state_plot <- 
bio %>% 
  filter(!is.na(state),
         deceased == "N",
         !duplicated(household_id)) %>% 
  count(state) %>% 
  arrange(desc(n)) %>% 
  slice(1:10) %>% 
  ggplot(aes(x = reorder(state, n),  y = n)) +
  geom_bar(stat = "identity", fill = "#027854") +
  coord_flip() +
  ggthemes::theme_tufte() +
  labs(y = "Number of Prospects", 
       x = "Primary Residence State",
       title = "Prospects by State")

ggplotly(state_plot)

Giving Data

How would we plot the distribution of gift dates (i.e., the number of gifts per day)?

# gifts per day
giving %>% 
  filter(credit_type == "Hard-Credit") %>% 
  ggplot(aes(x = gift_date)) +
  geom_histogram()

How about the distribution of gift amounts?

giving %>% 
  filter(credit_type == "Hard-Credit") %>% 
  ggplot(aes(x = gift_amt)) +
  geom_histogram()

giving %>% 
  filter(credit_type == "Hard-Credit",
         gift_amt < 1000000) %>% 
  ggplot(aes(x = gift_amt)) +
  geom_histogram()

giving %>% 
  filter(credit_type == "Hard-Credit",
         gift_amt < 100000) %>% 
  ggplot(aes(x = gift_amt)) +
  geom_histogram()

How does our fundraising progress compare to previous fiscal years?

What’s our first step?

giving <-
  giving %>% 
  mutate(fy = ifelse(month(gift_date) >= 7, 
                     year(gift_date) +1, 
                     year(gift_date)))

giving %>% 
  count(fy)
## # A tibble: 6 x 2
##      fy     n
##   <dbl> <int>
## 1  2016 66498
## 2  2017 75654
## 3  2018 75187
## 4  2019 75578
## 5  2020 75984
## 6  2021  9100
giving %>% 
  filter(credit_type == "Hard-Credit") %>% 
  group_by(fy) %>% 
  summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 6 x 2
##      fy total_giving
##   <dbl> <chr>       
## 1  2016 $788,796,803
## 2  2017 $946,349,616
## 3  2018 $903,744,855
## 4  2019 $849,433,928
## 5  2020 $860,076,232
## 6  2021 $105,749,432

Is this it? What else might we need to account for?

calculateFY <- function(date = Sys.Date(), date.format = "%Y-%m-%d", ytd = FALSE, 
                        fiscal.year = ifelse(month(Sys.Date()) >= 7,  year(Sys.Date()) +1, year(Sys.Date()))){ 
  
  date <- as.Date(date, date.format)
  
  fy.date <- 
    ifelse(month(date) %in% c(1:6), 
           year(date),
           year(date) + 1)
  
  if(ytd == TRUE){
    
    fy <- fiscal.year
    
    end.this.fy  <- as.Date(paste0("6/30/", fy), format = "%m/%d/%Y")
    
    days.left.this.fy <- end.this.fy - Sys.Date()
    
    end.date.fy  <- as.Date(paste0("6/30/", fy.date), format = "%m/%d/%Y")
    
    days.left.date.fy <- end.date.fy - date
    
    if(days.left.date.fy >= days.left.this.fy){
      
      return(fy.date)
    
    }else{
        
      return(NA)
      
      }
    
  }else{
    
    return(fy.date)
    
  }
  
}
# giving$fy <- unlist(lapply(giving$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))

ytd_table <- tibble(
  gift_date = seq(min(giving$gift_date), max(giving$gift_date), by = "day"),
)

ytd_table$fy_ytd <- unlist(lapply(ytd_table$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))

giving %>% 
  left_join(ytd_table) %>% 
  filter(!is.na(fy_ytd)) %>% 
  filter(credit_type == "Hard-Credit") %>% 
  group_by(fy_ytd) %>% 
  summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 5 x 2
##   fy_ytd total_giving
##    <dbl> <chr>       
## 1   2017 $106,580,994
## 2   2018 $104,495,699
## 3   2019 $106,861,990
## 4   2020 $112,001,699
## 5   2021 $105,749,432

There is a fundraising R package in development that may help and is available here.

Which prospects should we rate next?

What might our first step be?

# calculate annual and total giving
# see who is not rated or rated low
giving_by_household_and_fy <- 
  giving %>% 
  group_by(household_id, fy) %>% 
  summarise(giving = sum(gift_amt)) %>% 
  spread(fy, giving, sep = "") %>% 
  ungroup() %>% 
  mutate(total_giving = rowSums(select(., contains("fy")), na.rm = TRUE))

sum(duplicated(giving_by_household_and_fy$household_id))
## [1] 0
bio_with_household_giving <- 
  bio %>% 
  filter(!duplicated(household_id)) %>% 
  left_join(giving_by_household_and_fy)


bio_with_household_giving %>% 
  filter(capacity_source %in% c(NA, "screening")) %>% 
  filter(total_giving > 10000) %>% 
  filter(!is.na(fy2019)) %>% 
  arrange(desc(total_giving)) %>% 
  select(name, capacity, capacity_source, contains("fy"), total_giving) %>% 
  datatable(rownames = FALSE) %>% 
  formatCurrency(columns = c(3:10), digits = 0)

Where should we host a fundraising event?

What might our first step be?

bio_with_household_giving %>% 
  filter(total_giving > 10000) %>% 
  filter(!is.na(fy2019)) %>% 
  leaflet() %>% 
  addTiles() %>% 
  addCircleMarkers(clusterOptions = markerClusterOptions(),
                   label = ~paste0(name, ": ", scales::dollar(total_giving)))